Load generic libraries
source('configuration.r')
Load plot specific libraries
library(foreach)
library(readr)
suppressMessages(library(igraph))
library(ggraph)
library(ggalluvial)
Create graph edge data
dat <- read.table('../tables/antibiotics_gene_linkage.tsv', head=FALSE, stringsAsFactors = FALSE) %>%
mutate(id=paste0(V1,':',V2, ':', V3)) %>%
mutate(V6=str_replace(V6, 'PheCmlA5', 'Phe')) %>%
mutate(V6=str_replace(V6, 'Far1_Fcd', 'Far1_Bla'))
edges.full <- foreach(id=unique(dat$id), .combine=rbind) %do% {
tmp <- dat[dat$id == id, ]
if(nrow(tmp) < 2) {
NULL
}else{
edge <- data.frame(t(combn(sort(tmp$V6), 2)), id=id)
edge$dist <- foreach(r=1:nrow(edge), .combine = c) %do% {
g1 <- sort(tmp[tmp$V6 == edge[r, ]$X1, 4:5])
g2 <- sort(tmp[tmp$V6 == edge[r, ]$X2, 4:5])
min(abs(g1[2]-g2[1]), abs(g1[1]-g2[2]))
}
edge
}
}
## save for ad hoc analysis
write.table(edges.full, '../output_tables/ar_gene_linkage_edge_list_uncollapsed.tsv', row.names = F, quote=F, sep='\t')
Function for ploting
get_graph_data <- function(pattern='', reverse=FALSE, dist.fil=Inf){
if(reverse){
edges.collapsed <- filter(edges.full, !str_detect(id, pattern))
}else{
edges.collapsed <- filter(edges.full, str_detect(id, pattern))
}
## filtering and collapse into edge weights
edges.collapsed <- edges.collapsed %>%
group_by(X1, X2) %>%
summarise(n=length(dist), weight=sum(dist<dist.fil)) %>%
filter(n>1) ### keep edges with more than 1 support
## Run MCL graph clustering
write.table(edges.collapsed %>% filter(weight>0) %>% select(-n),
'tmp_edges.tsv', quote=F, sep='\t', col.names = F, row.names = F)
system('/mnt/software/bin/mcl tmp_edges.tsv --abc -o tmp_mcl.tsv -overlap keep -I 3 2>/dev/null')
mcl <- read_tsv('tmp_mcl.tsv', col_names = F)
grp <- foreach(c=1:nrow(mcl), .combine = 'rbind') %do%{
tmp <- as.character(na.exclude(as.character(mcl[c,])))
data.frame(vertex=tmp, grp=c, stringsAsFactors = F)
} %>% data.frame(row.names = 1)
grp$grp[grp$grp >= (which(table(grp) < 3)[1])] <- NA ## no annotation for clusters with 2 genes only
## generate graph data
g <- graph_from_data_frame(edges.collapsed[,1:3], directed=FALSE)
## vertices
colors <- pal_simpsons('springfield')(16)
n.colors <- colors[as.numeric(as.factor(str_replace(V(g)$name, '.*_', '')))]
V(g)$color <- n.colors
V(g)$community <- grp[V(g)$name,]##cluster_walktrap(g)$membership
V(g)$Type <- str_replace(V(g)$name, '.*_', '')
## Edges
E(g)$width <- edges.collapsed$n
E(g)$community <-
apply(as.data.frame(get.edgelist(g, names=FALSE)), 1,
function(x)
ifelse(V(g)$community[x[1]] == V(g)$community[x[2]],
LETTERS[V(g)$community[x[1]]], NA))
## format vertex names
aux <- function(x){
if(length(x) > 2){
paste0(x[1],'_',x[2])
}else{
x[1]
}
}
V(g)$name <- sapply(str_split(V(g)$name, '_'), aux)
#### carbapenemase
args <- read.table('../metadata/Bla_Carb.dat', head=F, stringsAsFactors=F)[,1]
carb <- setNames(rep('No', length(V(g)$name)),V(g)$name)
carb[names(carb) %in% args] <- 'Yes'
V(g)$carb <- carb
g
}
Main figure of all genes
g <- get_graph_data(dist.fil = 5e3)
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_character(),
## X6 = col_character(),
## X7 = col_character(),
## X8 = col_character(),
## X9 = col_character(),
## X10 = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 15 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 4 <NA> 10 columns 6 columns 'tmp_mcl.tsv' file 2 5 <NA> 10 columns 5 columns 'tmp_mcl.tsv' row 3 6 <NA> 10 columns 5 columns 'tmp_mcl.tsv' col 4 7 <NA> 10 columns 5 columns 'tmp_mcl.tsv' expected 5 8 <NA> 10 columns 5 columns 'tmp_mcl.tsv'
## ... ................. ... ................................................ ........ ................................................ ...... ................................................ .... ................................................ ... ................................................ ... ................................................ ........ ................................................
## See problems(...) for more details.
g_cols <- c((pal_lancet("lanonc")(9))[c(2:8,1)], pal_ucscgb("default")(26))[-c(2,7,8,9,10,12,17,18,19,20,21,23,24,26,27)]
## plot(1:19, 1:19, type="p", pch=65:(65+19-1), cex=2, col=g_cols) ## get legend alphabets
layout = create_layout(g, layout = 'igraph', algorithm = 'kk', kkconst=0.1)
p <- ggraph(layout)+#, circular=TRUE) +
geom_edge_arc(aes(width=((!is.na(community))+1), alpha=as.factor(is.na(community)), color=community),
curvature = 0.0,
#alpha=0.7,
end_cap=circle(1.5, 'mm'), start_cap=circle(1.5, 'mm')) +
scale_edge_color_manual(values=g_cols,
na.value = rgb(0.8,0.8,0.8,0.5)) +
scale_edge_alpha_manual(values=c(0.8, 0.3), guide=FALSE) +
geom_node_point(aes(fill=Type, color=carb),size=5, shape=21) +
scale_color_manual(values=c('grey','firebrick')) +
geom_node_text(aes(label = name), size=5, repel = TRUE, fontface='bold') +
scale_edge_width(range = c(1,1.5)) +
scale_fill_manual(values=pal_ucscgb('default')(16)[-c(1,3,4)]) +
theme_void()
p
ggsave('../plots/fig3a.ar_gene_linkage_graph.pdf', p, width=15, height=17)
Format data
edge.dat <- data.frame(v=V(g)$name, g=LETTERS[V(g)$community]) %>% filter(!is.na(g))
genome.info <- read.table('../tables/genome_info.dat', sep = '\t')
merge(dat, genome.info, by=c(1,2), all.x=TRUE) %>%
filter(!is.na(V6.y) | V1=='plasmid') %>% ## only take the high/medium quality ones
select(Species=V1, gene=V6.x) %>% mutate(gene=str_replace(gene, '_.*','')) -> sp.dat
sankey.dat <-
merge(sp.dat, edge.dat, by.x=2, by.y=1, all.y=T) %>%
mutate(Species=str_replace(Species,'_',' ')) %>%
group_by(Species, g) %>%
count %>%
mutate(Antibiotics_cluster=as.character(g)) %>%
filter(n>3) %>%
ungroup()
sankey.dat$Species <- factor(sankey.dat$Species, ordered = TRUE, levels = c("Acinetobacter baumannii",
"Acinetobacter sp.",
"Burkholderia lata",
"Enterobacter cloacae",
"Klebsiella pneumoniae",
"Pseudomonas aeruginosa",
"Corynebacterium striatum",
"Enterococcus faecalis",
"Enterococcus faecium",
"Enterococcus sp.",
"Staphylococcus aureus",
"Staphylococcus capitis",
"Staphylococcus cohnii",
"Staphylococcus epidermidis",
"Staphylococcus haemolyticus",
"Staphylococcus hominis",
"Staphylococcus lugdunensis",
"Staphylococcus warneri",
"plasmid"))
sankey.dat$Antibiotics_cluster <- factor(sankey.dat$Antibiotics_cluster, ordered=TRUE, levels=c(
"A", "G", "J", "K", "L", "M", "N", "B", "C", "D", "E", "F", "H", "I"
))
## mutate(Species=sapply(Species, function(x) ifelse(x=='plasmid', NA, x)))
Plot
p <- ggplot(subset(sankey.dat, Species!='plasmid'),aes(y=log10(n), axis1=Species, axis2=Antibiotics_cluster)) +
geom_alluvium(aes(fill=Antibiotics_cluster), width=1/5) +
geom_stratum(width=1/5, fill=NA, lwd=1) +
geom_text(stat = "stratum", label.strata = TRUE, size=10, fontface = "bold", nudge_x=-0.12, hjust=1) +
scale_fill_manual(values=g_cols[match(levels(sankey.dat$Antibiotics_cluster) , LETTERS[1:14])]) +
scale_x_discrete(limits = c("Species", "Antibiotics Cluster"), expand = c(.5, .05)) +
theme_void() +
theme()
p
ggsave('../plots/fig3b.ar_gene_linkage_sankey.pdf', p, width=25, height=25)
Staphylococcus aureus network
g <- get_graph_data('Staphylococcus_aureus')
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_character(),
## X6 = col_character(),
## X7 = col_character(),
## X8 = col_character(),
## X9 = col_character(),
## X10 = col_character(),
## X11 = col_character(),
## X12 = col_character(),
## X13 = col_character(),
## X14 = col_character(),
## X15 = col_character(),
## X16 = col_character(),
## X17 = col_character(),
## X18 = col_character(),
## X19 = col_character()
## )
p <- ggraph(g, layout = 'fr')+#, circular=TRUE) +
geom_edge_arc(aes(width=width, color=community),
curvature = 0.0,
alpha=0.7,
end_cap=circle(1.5, 'mm'), start_cap=circle(1.5, 'mm')) +
scale_edge_color_manual(values=g_cols,
na.value = rgb(0.8,0.8,0.8,0.5)) +
geom_node_point(aes(fill=Type, color=carb),size=5, shape=21) +
scale_color_manual(values=c('grey','firebrick')) +
geom_node_text(aes(label = name), size=5, repel = TRUE, fontface='bold') +
scale_edge_width(range = c(0.5,3)) +
scale_fill_manual(values=pal_ucscgb('default')(16)[-c(1,3,4)]) +
theme_void()
p
ggsave('../plots/sup11.staph_aureus_ar_gene_linkage_graph.pdf', p, width=5, height=5)
Genome network
g <- get_graph_data('plasmid', reverse = TRUE)
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 9 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 2 <NA> 43 columns 39 columns 'tmp_mcl.tsv' file 2 3 <NA> 43 columns 5 columns 'tmp_mcl.tsv' row 3 4 <NA> 43 columns 4 columns 'tmp_mcl.tsv' col 4 5 <NA> 43 columns 3 columns 'tmp_mcl.tsv' expected 5 6 <NA> 43 columns 3 columns 'tmp_mcl.tsv'
## ... ................. ... ................................................. ........ ................................................. ...... ................................................. .... ................................................. ... ................................................. ... ................................................. ........ .................................................
## See problems(...) for more details.
p <- ggraph(g, layout = 'fr')+#, circular=TRUE) +
geom_edge_arc(aes(width=width, color=community),
curvature = 0.0,
alpha=0.7,
end_cap=circle(1.5, 'mm'), start_cap=circle(1.5, 'mm')) +
scale_edge_color_manual(values=g_cols,
na.value = rgb(0.8,0.8,0.8,0.5)) +
geom_node_point(aes(fill=Type, color=carb),size= 5, shape=21) +
scale_color_manual(values=c('grey','firebrick')) +
geom_node_text(aes(label = name), size=5, repel = TRUE, fontface='bold') +
scale_edge_width(range = c(0.5,3)) +
scale_fill_manual(values=pal_ucscgb('default')(16)[-c(1,3,4)]) +
theme_void()
p
ggsave('../plots/sup12.genome_ar_gene_linkage_graph.pdf', p, width=16, height=16)
Plasmid network
g <- get_graph_data('plasmid')
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 7 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 2 <NA> 22 columns 13 columns 'tmp_mcl.tsv' file 2 3 <NA> 22 columns 9 columns 'tmp_mcl.tsv' row 3 4 <NA> 22 columns 8 columns 'tmp_mcl.tsv' col 4 5 <NA> 22 columns 6 columns 'tmp_mcl.tsv' expected 5 6 <NA> 22 columns 5 columns 'tmp_mcl.tsv'
## ... ................. ... ................................................. ........ ................................................. ...... ................................................. .... ................................................. ... ................................................. ... ................................................. ........ .................................................
## See problems(...) for more details.
p <- ggraph(g, layout = 'fr')+#, circular=TRUE) +
geom_edge_arc(aes(width=width, color=community),
curvature = 0.0,
alpha=0.7,
end_cap=circle(1.5, 'mm'), start_cap=circle(1.5, 'mm')) +
scale_edge_color_manual(values=g_cols,
na.value = rgb(0.8,0.8,0.8,0.5)) +
geom_node_point(aes(fill=Type, color=carb),size= as.numeric(as.factor(V(g)$carb) )*5, shape=21) +
scale_color_manual(values=c('grey','firebrick')) +
geom_node_text(aes(label = name), size=5, repel = TRUE, fontface='bold') +
scale_edge_width(range = c(0.5,3)) +
scale_fill_manual(values=pal_ucscgb('default')(16)[-c(1,3,4)]) +
theme_void()
p
ggsave('../plots/sup12.plasmid_ar_gene_linkage_graph.pdf', p, width=15, height=10)
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS release 6.9 (Final)
##
## Matrix products: default
## BLAS: /usr/lib64/R/lib/libRblas.so
## LAPACK: /usr/lib64/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggalluvial_0.9.1 ggraph_1.0.2 igraph_1.2.2 readr_1.1.1
## [5] foreach_1.4.4 ggsci_2.9 reshape2_1.4.3 stringr_1.3.0
## [9] tibble_2.0.1 tidyr_0.8.2 dplyr_0.8.0.1 gridExtra_2.3
## [13] ggplot2_3.1.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.0 pillar_1.3.1 compiler_3.4.4
## [4] plyr_1.8.4 viridis_0.5.1 iterators_1.0.9
## [7] tools_3.4.4 digest_0.6.18 viridisLite_0.3.0
## [10] evaluate_0.10.1 gtable_0.2.0 pkgconfig_2.0.2
## [13] rlang_0.3.1 cli_1.0.1 ggrepel_0.8.0
## [16] yaml_2.1.18 withr_2.1.2 knitr_1.20
## [19] hms_0.4.2 rprojroot_1.3-2 tidyselect_0.2.5
## [22] glue_1.3.0 R6_2.4.0 fansi_0.4.0
## [25] rmarkdown_1.9 farver_1.0 tweenr_1.0.0
## [28] purrr_0.3.0 magrittr_1.5 units_0.6-1
## [31] MASS_7.3-49 backports_1.1.2 scales_1.0.0
## [34] codetools_0.2-15 htmltools_0.3.6 assertthat_0.2.0
## [37] ggforce_0.1.3 colorspace_1.3-2 labeling_0.3
## [40] utf8_1.1.4 stringi_1.3.1 lazyeval_0.2.1
## [43] munsell_0.5.0 crayon_1.3.4